Load Libraries & Set Configurations/ Permissions

In [2]:
# Auto-reload edited local modules (e.g. libraries_bert) without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [3]:
%matplotlib inline
# NOTE(review): star import presumably provides np, sns, pd, json, HTTPBasicAuth,
# init_notebook_mode, go, py, plotly, FreqDist, itertools, en_core_web_lg,
# EntityRuler, displacy, etc. — contents of libraries_bert are not visible here.
from libraries_bert import *

np.random.seed(0)  # fixed seed for reproducibility
sns.set_style('white')

# Plotly credentials are loaded from a local JSON file so they are not hardcoded.
# NOTE(review): absolute Windows path — consider a configurable data directory.
with open('G:\\aws_processed\\info.json', 'r') as f:
    d = json.load(f)
    username = d['plotly_username']
    api_key = d['plotly_api_key']

auth = HTTPBasicAuth(username, api_key)
headers = {'Plotly-Client-Platform': 'python'}
init_notebook_mode(connected=True)  # render plotly charts inline

import warnings
warnings.filterwarnings('ignore')

Load & Merge Data

Return to Index

In [5]:
# Load sentiment-labelled review data (one row per review).
data = pd.read_json('restaurant1_english_sentiment.json', orient='columns',
                    convert_dates=True)

# Load store locations; street-address columns are dropped for privacy.
# (reassignment instead of inplace=True keeps the cell idempotent on re-run)
location = pd.read_json('locations.json', orient='columns')
location = location.drop(['address_line_1','address_line_2'], axis=1)

# Get relevant columns from dataset
data_cut = data[['review_id','store_id','platform_name', 'clean_text','combined', 
                 'bert_label', 'rating', 'bert_pos','bert_neg', 'date']].copy()

# merge location and review data
full_data = data_cut.merge(location, on='store_id', how='left')

# split review data by sentiment (bert_label is 'pos' or 'neg')
full_data_pos = full_data.loc[full_data.bert_label == 'pos'].copy()
full_data_neg = full_data.loc[full_data.bert_label == 'neg'].copy()

# clean RangeIndex so later positional assignments align correctly
full_data_pos = full_data_pos.reset_index(drop=True)
full_data_neg = full_data_neg.reset_index(drop=True)

len(full_data_pos), len(full_data_neg)
Out[5]:
(251903, 28195)
In [6]:
# Identifying location info has not been outputted to ensure privacy. 
# Spot-check the merged positive reviews (non-identifying columns only).
full_data_pos[['review_id', 'store_id', 'platform_name', 'clean_text', 'combined',
               'bert_label', 'rating', 'bert_pos', 'bert_neg', 'date', 
               'country', 'city', 'brand_id']].head()
Out[6]:
review_id store_id platform_name clean_text combined bert_label rating bert_pos bert_neg date country city brand_id
0 585315 288 GoReview lungelo Lungelo pos 5 0.998606 0.000862 2018-10-12 15:37:38+00:00 ZA Roodepoort 1.0
1 585319 288 GoReview wizzy Wizzy pos 5 0.998829 0.000673 2018-10-12 15:41:20+00:00 ZA Roodepoort 1.0
2 77301 55 TripAdvisor love to visit spur in white river service is g... Love to visit Spur in White River - service is... pos 4 0.999053 0.000137 2017-01-17 01:11:51+00:00 ZA Nelspruit 1.0
3 585187 111 GoReview elijah funny upbeat personality and very fast... #Elijah Funny, upbeat personality and very fa... pos 5 0.998937 0.000251 2018-10-12 14:12:26+00:00 ZA Johannesburg South 1.0
4 585323 223 GoReview paddy Paddy pos 5 0.995791 0.002897 2018-10-12 15:44:00+00:00 ZA Benoni 1.0

Extract Menu Entities from Reviews

Return to Index

Match patterns were created with spaCy and used with their EntityRuler method to extract the menu items from the review text. The flexibility of their patterns — the ability to use parts of speech, to create complex matches with lists of possible words, and the option to exclude certain words — makes this a powerful option.

The patterns were created from the menu and adjusted to include the most common misspellings and phrasings. Around 3000 reviews were manually inspected using Prodigy annotation software.


Visual Example of Entity Patterns & Extraction

In [7]:
# Load spaCy EntityRuler match patterns (one JSON object per line, JSONL format).
match_patterns = []

# NOTE(review): ".../menu_seeds2.jsonl" looks like a redacted/placeholder path —
# confirm the real location before re-running.
with open(".../menu_seeds2.jsonl", "r") as read_file:
    for line in read_file:
        match_patterns.append(json.loads(line))
        
# peek at the first few patterns
match_patterns[:5]
Out[7]:
[{'label': 'MENU',
  'pattern': [{'LOWER': {'IN': ['small',
      'starter',
      'buffalo',
      'chicken',
      'portion',
      'share',
      'shared',
      'large',
      'full',
      'portion',
      'spicy',
      'hot',
      'bbq']},
    'OP': '*'},
   {'LOWER': 'of', 'OP': '?'},
   {'LOWER': {'IN': ['wings', 'wing', 'wingz', 'winglet', 'winglets']}},
   {'LOWER': {'IN': ['starters', 'starter', 'full', 'portion']}, 'OP': '*'}],
  'id': 'chicken'},
 {'label': 'MENU',
  'pattern': [{'LOWER': {'IN': ['small',
      'starter',
      'shared',
      'large',
      'full',
      'portion']},
    'OP': '*'},
   {'LOWER': 'of', 'OP': '?'},
   {'LOWER': {'IN': ['chicken', 'crumbed', 'kid', 'kids']}, 'OP': '+'},
   {'LOWER': {'IN': ['strips', 'strip']}},
   {'LOWER': {'IN': ['starters', 'starter', 'full', 'portion', 'large']},
    'OP': '*'}],
  'id': 'chicken'},
 {'label': 'MENU',
  'pattern': [{'LOWER': {'IN': ['small',
      'starter',
      'shared',
      'large',
      'full',
      'portion',
      'side']},
    'OP': '*'},
   {'LOWER': 'of', 'OP': '?'},
   {'LOWER': {'IN': ['chicken']}, 'OP': '?'},
   {'LOWER': {'IN': ['liver', 'livers']}},
   {'LOWER': {'IN': ['starters', 'starter', 'side']}, 'OP': '?'}],
  'id': 'chicken'},
 {'label': 'MENU',
  'pattern': [{'LOWER': {'IN': ['cheese',
      'cheesy',
      'small',
      'starter',
      'mexican',
      'share',
      'shared',
      'large',
      'full',
      'portion']},
    'OP': '*'},
   {'LOWER': 'of', 'OP': '?'},
   {'LOWER': 'chicken'},
   {'LOWER': 'quesadilla'},
   {'LOWER': {'IN': ['starters', 'starter']}, 'OP': '?'}],
  'id': 'chicken'},
 {'label': 'MENU',
  'pattern': [{'LOWER': {'IN': ['cheese',
      'cheesy',
      'small',
      'starter',
      'mexican',
      'share',
      'shared',
      'large',
      'full',
      'portion']},
    'OP': '*'},
   {'LOWER': 'quesadilla'},
   {'LOWER': 'with'},
   {'LOWER': 'chicken'},
   {'LOWER': {'IN': ['strips', 'strip']}, 'OP': '?'}],
  'id': 'chicken'}]
In [8]:
# Build a spaCy pipeline that tags MENU entities using only the rule-based
# EntityRuler; the statistical NER is removed so it cannot interfere.
nlp = en_core_web_lg.load()
nlp.remove_pipe("ner")

ruler = EntityRuler(nlp)

ruler.add_patterns(match_patterns)
nlp.add_pipe(ruler)

# settings for displacy visualizer
colors = {"MENU": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["MENU"], "colors": colors}

# visualize the entities highlighted in the text and 
# the associated ids (i.e., menu subcategories)
# NOTE(review): despite the generic name, this slice is the first 50 *negative* reviews.
test_text = full_data_neg.clean_text[:50]

for doc in list(nlp.pipe(test_text)):
    if len(doc.ents) >= 1:
        for ent in doc.ents:
            # ent_id_ carries the pattern's 'id' field (the menu subcategory)
            print(ent.text + ' --> ' + ent.ent_id_)
        displacy.render(doc, style="ent", options=options)
        print('-'*100)
steak --> steaks
mashroom sauce --> sauces
baked potato --> side
onions --> ingredients
i ordered a sirlon steak MENU with mashroom sauce MENU with baked potato MENU and no onions MENU and decided to add caslaw with my meal the menu said add 5 for the extra only to pay more for the extra than wat was advised i will not go back the the service was terrible also the managers were not helpful at all
----------------------------------------------------------------------------------------------------
meat --> meat
chicken --> chicken
ribs --> ribs
spinach --> side
cream spinach --> side
cocktail juice --> drinks
food tastes terrible like plastic ie meat MENU very tasteless chicken MENU and ribs MENU over refrigerated poor ingredients on spinach MENU not as per standard cream spinach MENU we used to cocktail juice MENU very week and tastes like there is water
----------------------------------------------------------------------------------------------------
rib burger --> burger
chips --> side
oil --> ingredients
burger buns --> burger
ribs --> ribs
had lunch there yesterday ordered the rib burger MENU meal and i was disappointed with what i got the chips MENU were not crispy but crunchy they tasted like they were fried in stale oil MENU the burger buns MENU were stale and the ribs MENU were burnt on the outside and raw on the inside pink raw i am never eating there again
----------------------------------------------------------------------------------------------------
burger specials --> specials
very hungry and undecided about which burger specials MENU to follow we eventually went with trusty ole spur and golden bay which we have frequented expecting crowds and ques for buy 1 get 1 free we were starved
----------------------------------------------------------------------------------------------------
breakfast --> breakfast
breakfast --> breakfast
cheesegriller --> sausage
toast --> side
butter --> ingredients
jam --> condiment
we were on our way back from holiday very tired and hungry we passed through for breakfast MENU our breakfast MENU came without the extra cheesegriller MENU we ordered the toast MENU butter MENU and jam MENU were also everything went wrong
----------------------------------------------------------------------------------------------------
liver --> chicken
the experience was ok suggestion more options for the play area like more games try to upgrade it to playstation or xbox for the kids they were practically asleep when they were singing happy birthday for the children one dish a liver MENU dish was particularly poor it was like it was made yesterday or even before that
----------------------------------------------------------------------------------------------------
monday burgers special --> specials
burger --> burger
bun --> burger
patty --> burger
went with children and friends to a monday burgers special MENU and it was a disappointment the burger MENU is so dry due to the dry bun MENU and the overcooked patty MENU the frills are standard so no problem there specials a disappointment
----------------------------------------------------------------------------------------------------
monday special --> specials
sauce --> sauces
milkshake --> dessert
well i went for my usual monday special MENU yesterday and i was not impressed at all i went early so the place was not as full yet and my food took forever to come the waiter was also extremely slow i asked her for strong sauce MENU as well as to bring my milkshake MENU with the meal and both came at the end of the meal
----------------------------------------------------------------------------------------------------
sauces --> sauces
more sauces MENU on the table
----------------------------------------------------------------------------------------------------
jams --> condiment
jam --> condiment
tomoto sauce --> condiment
do not like the jams MENU served in the bottles battled to get the jam MENU out and very messy can easily mistaken for tomoto sauce MENU
----------------------------------------------------------------------------------------------------
bacon --> pork
bacon MENU was a little salty charmaine
----------------------------------------------------------------------------------------------------
bottomless coffee --> beverage
bring back the bottomless coffee MENU
----------------------------------------------------------------------------------------------------
ice cream --> dessert
i would like to say that the star lighter at spur must be made standard to every store because our kids like that more than the ice cream MENU and may the plastics be branded for take that are left overs we are also entitled to get a branaded plastic thank you
----------------------------------------------------------------------------------------------------
coffee --> beverage
waited long for coffee MENU and was not very hot food was medium hot waiter attempted to remove plates when petson was still eating teached for plate over this person
----------------------------------------------------------------------------------------------------
wors --> sausage
worth it to be around however no pap n wors MENU around please make it happen tx
----------------------------------------------------------------------------------------------------
monday night burgers --> specials
onions --> ingredients
worst service ever waiters so blas went for monday night burgers MENU as kids were looking forward to it got there 8 o clock food arrived 9 15 onions MENU looked like it was just thrown in a pan for a minute i love spur but hermanus spur will never see me again
----------------------------------------------------------------------------------------------------
bottomless ribs and wings --> specials
there is a huge sign on the wall near the entrance to this spur outlet at paarl mall announcing bottomless ribs and wings MENU on thursday s my wife and i were in the mall yesterday and being a misleading advertising
----------------------------------------------------------------------------------------------------
drinks --> drinks
drinks --> drinks
had been having an excellent evening up until9 55 when we wanted to order our last drinks MENU before we leave we ordered with our waiter a last round of drinks MENU together with the bill ofwhich he only bad treatment from the manager pola
----------------------------------------------------------------------------------------------------
worcestershire sauce --> condiment
does not stock coleman s worcestershire sauce MENU which is msg free whereas maggie s contain msg
----------------------------------------------------------------------------------------------------
wings --> chicken
it is was fine until my points do not reflect on my card anyway buffallo wings MENU are devine
----------------------------------------------------------------------------------------------------
breakfasts --> breakfast
we waited 2 5 hours for 3 small breakfasts MENU and the service was not good at all
----------------------------------------------------------------------------------------------------
croutons --> ingredients
salad --> vegetarian
salad --> vegetarian
croutons --> ingredients
oil --> ingredients
always come her and did got croutons MENU in my salad MENU how can you make a salad MENU with no croutons MENU when it come to the table it was oil MENU and no taste s
----------------------------------------------------------------------------------------------------
beef ribs --> ribs
burgers --> burger
patties --> burger
roll --> burger
waffle --> dessert
just average one of the few spurs where you can order beef ribs MENU was tasteless and overcooked nothing to write home about we also ordered burgers MENU but have the patties MENU shrunk either that or it was an oversized roll MENU service just abysmal wait to order a drink wait to find someone to get the bill wait to get anyone s attention and it was not even a monday night what i did enjoy was the famous log cabin waffle MENU always a treat
----------------------------------------------------------------------------------------------------
red meat --> meat
fish and seafood --> seafood
me and my husband is on holiday we ate there yesterday because we craved for red meat MENU the most restaurants over there serve fish and seafood MENU was nuce but could be better
----------------------------------------------------------------------------------------------------
In [9]:
%%time

def get_ents(df, col, patterns):
    """
    Return a dictionary of menu entities from SpaCy EntityRuler, 
    based on match patterns.
    
    params:
    df: Dataframe with text reviews.   
    col: Column to apply the EntityRuler on. 
    patterns: Entity patterns to use for extraction. 
    """
    
    nlp = en_core_web_lg.load()
    nlp.remove_pipe("ner")

    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    
    menu_ents = []
    for r_id, doc in zip(df.review_id.tolist(), list(nlp.pipe(df[col].tolist()))):
        ent_ids = []
        ent_text = []
        for ent in doc.ents:
            ent_ids.append(ent.ent_id_)
            ent_text.append(ent.text)
        menu_ents.append({"text": ent_text, "id": ent_ids, "review_id": r_id})
    return menu_ents

pos_menu_ents = get_ents(full_data_pos, 'clean_text', match_patterns)
neg_menu_ents = get_ents(full_data_neg, 'clean_text', match_patterns)

len(pos_menu_ents), len(neg_menu_ents)
Wall time: 10min 34s
Out[9]:
(251903, 28195)

Combine Entities with Review Data

In [10]:
def get_menu_lists(ent_dict):
    """Split the per-review entity dicts into two parallel lists:
    menu item texts and menu category ids (one entry per review)."""
    menu_items = [review['text'] for review in ent_dict]
    menu_cats = [review['id'] for review in ent_dict]

    return menu_items, menu_cats

# Attach the extracted entities back onto the review frames.
menu_items_pos, menu_cats_pos = get_menu_lists(pos_menu_ents)
menu_items_neg, menu_cats_neg = get_menu_lists(neg_menu_ents)

# Positional assignment via pd.Series is safe here because both frames were
# reset to a clean RangeIndex after the sentiment split.
full_data_pos['menu_items'] = pd.Series(menu_items_pos)
full_data_pos['menu_cats'] = pd.Series(menu_cats_pos)

full_data_neg['menu_items'] = pd.Series(menu_items_neg)
full_data_neg['menu_cats'] = pd.Series(menu_cats_neg)

full_data_neg[['menu_items', 'menu_cats']].head()
Out[10]:
menu_items menu_cats
0 [steak, mashroom sauce, baked potato, onions] [steaks, sauces, side, ingredients]
1 [] []
2 [] []
3 [meat, chicken, ribs, spinach, cream spinach, ... [meat, chicken, ribs, side, side, drinks]
4 [rib burger, chips, oil, burger buns, ribs] [burger, side, ingredients, burger, ribs]

Count Menu Items and Categories

In [11]:
# Count extracted menu items per review. Categories are de-duplicated with
# set() so each category counts at most once per review.
# .map on the Series replaces the original positional list comprehensions —
# same values, but index-aligned and robust to any future reindexing.
full_data_neg['num_items'] = full_data_neg['menu_items'].map(len)
full_data_neg['num_cats'] = full_data_neg['menu_cats'].map(lambda cats: len(set(cats)))

# menu categories are only counted once per review
full_data_pos['num_items'] = full_data_pos['menu_items'].map(len)
full_data_pos['num_cats'] = full_data_pos['menu_cats'].map(lambda cats: len(set(cats)))

full_data_neg[['clean_text','menu_items', 'menu_cats', 'num_items', 'num_cats']].head()
Out[11]:
clean_text menu_items menu_cats num_items num_cats
0 i ordered a sirlon steak with mashroom sauce w... [steak, mashroom sauce, baked potato, onions] [steaks, sauces, side, ingredients] 4 4
1 lower your prices [] [] 0 0
2 we have visited this restaurant many times in ... [] [] 0 0
3 food tastes terrible like plastic ie meat very... [meat, chicken, ribs, spinach, cream spinach, ... [meat, chicken, ribs, side, side, drinks] 6 5
4 had lunch there yesterday ordered the rib burg... [rib burger, chips, oil, burger buns, ribs] [burger, side, ingredients, burger, ribs] 5 4

Questions Displayed Visually

Return to Index

Are there more menu item mentions in negative or positive reviews?

In [24]:
def plot_comparison(df1, df2, col, normalize=True, title=None):
    """Plot, for two dataframes, the share (or count) of reviews with a zero
    value in `col` versus a non-zero value.

    params:
    df1, df2: Dataframes to compare (here: positive vs. negative reviews).
    col: Integer column to bucket (e.g. 'num_items'); 0 means "no mentions".
    normalize: If True plot percentages, otherwise raw counts.
    title: Figure title.

    Bug fix: the original summed value_counts()[1:-1], a positional slice that
    silently dropped the least-frequent bin from the non-zero total. The
    non-zero bucket is now computed as (total - zero bin) so every bin counts.
    """
    d1 = {}
    d2 = {}
    if normalize:
        vc1 = df1[col].value_counts(normalize=True) * 100
        vc2 = df2[col].value_counts(normalize=True) * 100
        d1['no categories'] = vc1.loc[0]          # % of reviews with col == 0
        d1['categories'] = vc1.sum() - vc1.loc[0]  # % with col >= 1
        d2['no categories'] = vc2.loc[0]
        d2['categories'] = vc2.sum() - vc2.loc[0]
    else:
        vc1 = df1[col].value_counts()
        vc2 = df2[col].value_counts()
        d1['no_cats'] = vc1.loc[0]
        d1['cats'] = vc1.sum() - vc1.loc[0]
        d2['no_cats'] = vc2.loc[0]
        d2['cats'] = vc2.sum() - vc2.loc[0]

    X = np.arange(len(d1))
    fig = plt.figure(figsize=(10, 6))
    ax = plt.subplot(111)
    # second series is shifted left so the grouped bars sit side by side
    ax.bar(X, d1.values(), width=0.2, color='powderblue', align='center')
    ax.bar(X-0.2, d2.values(), width=0.2, color='salmon', align='center', alpha=0.8)
    ax.legend(('Positive Reviews','Negative Reviews'))
    plt.xticks(X, d1.keys(), fontsize=12)
    plt.title(title, fontsize=17)

plot_comparison(full_data_pos, full_data_neg, 'num_items', normalize=True,
                    title = '% of Reviews that Mention Menu Items')

Do average menu item mentions differ by platform?

In [30]:
# Average menu-item mentions per negative review, by platform.
fig = plt.figure(figsize=(12, 6))
platform_means = full_data_neg.groupby(['platform_name'])['num_items'].mean()\
                                                                      .sort_values()
platform_means.plot(kind='bar')
# tick count derived from the data instead of the hardcoded 7 platforms
plt.xticks(np.arange(len(platform_means)), rotation=0, fontsize=12)
plt.xlabel(None)
plt.title('Average Number of Named Items per Negative Review by Platform',
          fontsize=14);
In [31]:
# Average menu-item mentions per positive review, by platform.
fig = plt.figure(figsize=(12, 6))
platform_means = full_data_pos.groupby(['platform_name'])['num_items'].mean()\
                                                                      .sort_values()
platform_means.plot(kind='bar')
# tick count derived from the data instead of the hardcoded 7 platforms
plt.xticks(np.arange(len(platform_means)), rotation=0, fontsize=12)
plt.xlabel(None)
plt.title('Average Number of Named Items per Positive Review by Platform',
          fontsize=14);

Which menu sub-categories are frequently mentioned in reviews?

In [100]:
def get_category_freq(df, col):
    """Count how often each menu category is mentioned across all reviews.

    params:
    df: Dataframe whose `col` holds a list of category ids per review.
    col: Column containing the per-review category lists.

    Returns a dict mapping category -> total mention count.

    Improvements: collections.Counter (stdlib) replaces nltk's FreqDist plus
    the manual dict-building loop, and the pointless .tolist().copy() is gone.
    """
    from collections import Counter  # stdlib equivalent of FreqDist here

    cats_tokens = itertools.chain.from_iterable(df[col])
    return dict(Counter(cats_tokens))

# Per-sentiment category frequencies, plus a combined total across both.
neg_cats_freq = get_category_freq(full_data_neg, 'menu_cats')
pos_cats_freq = get_category_freq(full_data_pos, 'menu_cats')
all_cats_freq = {k: neg_cats_freq.get(k, 0) + pos_cats_freq.get(k, 0) for k in set(neg_cats_freq) | set(pos_cats_freq)}
In [43]:
def plot_menu_cats(freq_dict, title=None, normalize=True):
    """Plot a horizontal bar chart of category frequencies.

    params:
    freq_dict: dict mapping category -> mention count.
    title: Chart title.
    normalize: If True plot percentages of total mentions, else raw counts.

    Improvement: the two branches duplicated the sort/figure/barplot code;
    they now only choose the value column and x-label.
    """
    df = pd.DataFrame(list(freq_dict.items()), columns=['categories', 'frequency'])

    if normalize:
        df['percentage'] = (df.frequency / df.frequency.sum()) * 100
        value_col = 'percentage'
        xlabel = "Percentage of Menu Category Mentions in Reviews"
    else:
        value_col = 'frequency'
        xlabel = "Frequency of Menu Category Mentions in Reviews"

    # sort descending so the most-mentioned category is at the top
    df = df.sort_values([value_col], ascending=False).reset_index(drop=True)

    plt.figure(figsize=(12,10))
    ax = sns.barplot(x=value_col, y="categories", data=df)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Menu Categories', fontsize=12)
    ax.tick_params(axis="y", labelsize=12)
    ax.set_title(label=title, fontsize=14);

plot_menu_cats(neg_cats_freq, title='Percentage of Menu Category Mentions in Negative Reviews',
               normalize=False)
In [44]:
# NOTE(review): title says "Percentage" but normalize=False plots raw counts —
# confirm which was intended.
plot_menu_cats(pos_cats_freq, title='Percentage of Menu Category Mentions in Positive Reviews',
              normalize=False)
In [45]:
# Normalized view: each category as a % of all category mentions (pos + neg).
plot_menu_cats(all_cats_freq, title='Percentage of Menu Category Mentions in All Reviews',
              normalize=True)

Do menu item frequencies change over time?

In [101]:
def plotly_plot(df_cut, title, yaxis=None, xaxis=None, filename=None):
    """Plot an interactive plotly line chart, one trace per column.

    params:
    df_cut: Dataframe with a (date) index and one column per series.
    title: Chart title.
    yaxis: Y-axis label.
    xaxis: X-axis label; defaults to 'Time' when not given.
    filename: If given, also publish to plotly and save a local .html copy.

    Bug fix: `xaxis` was accepted but ignored (the label was hardcoded to
    'Time'); it is now honored, with 'Time' kept as the default.
    """
    fig = go.Figure()

    for col in df_cut.columns:
        fig.add_trace(go.Scatter(
            x=df_cut.index,
            y=df_cut[col],
            name=col))

    fig.update_layout(
        title=title,
        xaxis_title=xaxis or 'Time',
        yaxis_title=yaxis,
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"))

    fig.show()

    if filename:
        # publish to the plotly account and keep a local offline copy
        py.iplot(fig, filename=filename, sharing='public', auto_open=False)
        plotly.offline.plot(fig, filename=filename + '.html', auto_open=False)

def plot_category_trends(data, item_col, cat_cols, filename=None, 
                         title=None, normalize=True, 
                         year_start='2011', year_end='2019', time='Y'):
    """Plot a time series of menu-category mentions.

    params:
    data: Review dataframe with a 'date' column and `item_col` of lists.
    item_col: Column holding per-review category lists (e.g. 'menu_cats').
    cat_cols: Which categories to plot.
    filename: Optional export name forwarded to plotly_plot.
    normalize: If True, plot each category as a % of all mentions per period.
    year_start, year_end, time: Window bounds and resampling frequency.

    Bug fix: plotly_plot was called positionally, so `filename` landed in its
    unused `xaxis` slot and nothing was ever exported; keyword arguments now
    route `yaxis` and `filename` correctly.
    """
    start = year_start + '-12-31'
    end = year_end + '-12-31'

    df = data.set_index(pd.to_datetime(data.date).dt.normalize(), drop=True)
    mask = (df['date'] > start) & (df['date'] <= end)
    df = df.loc[mask]

    df = df.drop('date', axis=1)
    # one-hot encode the per-review category lists
    df = df[item_col].str.join('|').str.get_dummies()

    year_gr = df.resample(time).sum()
    if normalize:
        yaxis = 'Percentage'
        # taken as a % of all category mentions in that period
        year_gr['total'] = year_gr.sum(axis=1)
        for col in year_gr.columns:
            year_gr[col] = round((year_gr[col]/year_gr['total'])*100, 2)
    else:
        yaxis = 'Frequency'

    df_cut = year_gr[cat_cols].copy()

    plotly_plot(df_cut, title, yaxis=yaxis, filename=filename)


plot_category_trends(full_data_pos, 'menu_cats', 
                ['breakfast', 'side', 'steaks', 'burger', 'ribs', 
                 'chicken', 'dessert', 'sauces','specials'], 
                #filename='menu_cats_positive.png',
                title = "Trends for Menu Categories in Positive Reviews", 
                normalize=True, year_start='2016', year_end='2018', time='M')    
In [102]:
# Same trend plot for negative reviews, yearly buckets from 2015 onward.
plot_category_trends(full_data_neg, 'menu_cats', 
                     ['side', 'steaks', 'burger', 'breakfast', 'ribs', 'chicken', 
                      'dessert','sauces', 'specials'],
                     title = "Trends for Menu Categories in Negative Reviews", 
                     year_start='2015', time='Y')

Have menu item frequencies per platform changed over time?

In [103]:
def reset_index(df):
    '''Return df with its (date) index converted back into a normalized
    DatetimeIndex, keeping all data columns.'''
    index_as_cols = df.index.to_frame(index=False)
    body = df.reset_index(drop=True)
    # In merge the order in which the dataframes are passed matters when the
    # index contains a Categorical:
    # pd.merge(df, index_df, left_index=True, right_index=True) does not work.
    merged = pd.merge(index_as_cols, body, left_index=True, right_index=True)
    merged = merged.set_index(pd.to_datetime(merged.date).dt.normalize(), drop=True).copy()
    merged.drop('date', inplace=True, axis=1)
    return merged

def plot_platforms(df, title=None, normalize=True, 
                   year_start='2011', year_end='2019',
                   time='Y'):
    """Plot per-platform menu-category mention totals over time.

    params:
    df: Review dataframe with 'date', 'platform_name' and 'num_cats' columns.
    normalize: If True, plot each platform as a % of the period total.
    year_start, year_end, time: Window bounds and resampling frequency.

    Bug fix: the original sliced pl_df.columns[:-2], which excluded the
    'total' column AND the last platform from both normalization and the
    plot; only 'total' is excluded now.

    NOTE: converts the caller's platform_name column to categorical in place,
    preserved from the original behavior.
    """
    start = year_start + '-12-31'
    end = year_end + '-12-31'

    df['platform_name'] = df['platform_name'].astype('category')
    platforms = df.set_index(pd.to_datetime(df.date).dt.normalize(), drop=True).copy()
    mask = (platforms['date'] > start) & (platforms['date'] <= end)
    platforms = platforms.loc[mask]

    # reassignment instead of inplace drop avoids SettingWithCopy on the slice
    platforms = platforms.drop('date', axis=1)

    pl_df = platforms.pivot_table(index='date', columns='platform_name', 
                                  values='num_cats', aggfunc='sum')
    pl_df = reset_index(pl_df)
    pl_df = pl_df.resample(time).sum()
    pl_df['total'] = pl_df.sum(axis=1)

    if normalize:
        # every platform column (all but the trailing 'total')
        for col in pl_df.columns[:-1]:
            pl_df[col] = round((pl_df[col]/pl_df['total'])*100, 2)

    fig = go.Figure()

    for col in pl_df.columns[:-1]:
        fig.add_trace(go.Scatter(
            x=pl_df.index,
            y=pl_df[col],
            name=col))

    fig.update_layout(
        title=title,
        xaxis_title="Years",
        yaxis_title="Percentage",
        font=dict(
            family="Courier New, monospace",
            size=16,
            color="#7f7f7f"))
    
    fig.show()
    
plot_platforms(full_data_pos, title = "Menu Category Details in Positive Reviews by Platform",
              normalize=True, year_end='2018')

Which sides are being mentioned the most?

In [104]:
# Reviews whose extracted category list includes 'side'.
neg_sides = full_data_neg[full_data_neg.menu_cats.map(set(['side']).issubset)]
pos_sides = full_data_pos[full_data_pos.menu_cats.map(set(['side']).issubset)]

# create a lookup dictionary to group synonyms (for example: chips, fries, and 
# potato chips are all referring to the same item - chips) 
# Maps canonical side name -> list of surface forms seen in reviews.
subs= {'hot veg': ['vegies', 'cooked vegetables', 'veggies','veg', 'vegetables','hot veg', 
                   'veggie', 'hot veggies','side veg','vegetable', 'side of vegetables', 
                   'cooked veggies','hot vegetables', 'side vegetables', 'side of veggies',
                   'hot veggie', 'carrot', 'hot vegies', 'side veggies', 'cooked veg'],   
       'baked potato': ['baked potato', 'baked potatoes', 'bake potato', 'jacket potato'], 
       'cream spinach': ['spinach', 'creamed spinach', 'cream spinach'], 
       'butternut': ['pumpkin', 'butternut'],
       'onion rings': ['onion rings', 'ring onions', 'onions rings','onion ring', 'onions ring', 
                       'ring onion'], 
       'chips': ['chips','chip', 'fries', 'potato chips', 'potato fries'],
       'sweet potato fries': ['sweet potato chips', 'sweet potato fries', 'sweetpotato chips'],
       'side dish': ['side dish','side dishes'],
       'toast': ['toast'],
       'mushrooms': ['mushrooms'],
       'side salad': ['side salad'], 
       'smileys': ['smileys']}

def invert_dict(d): 
    """Invert a {canonical: [synonyms]} mapping into
    {synonym: [canonical, ...]}.

    Example: subs['chips'] contains 'fries', so the inverse maps
             'fries' -> ['chips'].
    """
    inverse = dict()
    for canonical, synonyms in d.items():
        for synonym in synonyms:
            # first occurrence creates the list, later ones append
            inverse.setdefault(synonym, []).append(canonical)
    return inverse

def flatten(A):
    """Flatten arbitrarily nested lists into one flat list,
    preserving left-to-right order (iterative, stack-based)."""
    flat = []
    stack = list(reversed(A))
    while stack:
        item = stack.pop()
        if isinstance(item, list):
            stack.extend(reversed(item))
        else:
            flat.append(item)
    return flat

def get_sides(df, inv_subs, title=None):
    """Collect side-dish mentions, group them via the synonym lookup dict,
    and plot their frequencies.

    params:
    df: Review dataframe with parallel 'menu_items'/'menu_cats' list columns.
    inv_subs: Inverted synonym dict mapping surface form -> canonical name(s).
    title: Chart title forwarded to plot_menu_cats.

    Bug fix: the original computed FreqDist(sides) before the synonym
    replacement and immediately overwrote it — that dead statement is removed.
    """
    # collect raw side-dish mentions across all reviews
    sides = [item
             for items, cats in zip(df.menu_items, df.menu_cats)
             for item, cat in zip(items, cats)
             if cat == 'side']

    # map each mention to its canonical name(s), then flatten the lists
    replaced_sides = [inv_subs.get(item, item) for item in sides]
    flat_sides = flatten(replaced_sides)

    # FreqDist is Counter-like, so a plain dict() conversion suffices
    sides_freq = dict(FreqDist(flat_sides))

    plot_menu_cats(sides_freq, title=title)
    
get_sides(neg_sides, inv_subs, title= 'Sides Mentioned in Negative Reviews')
In [53]:
# Same grouped side-dish chart for the positive reviews.
get_sides(pos_sides, inv_subs, title= 'Sides Mentioned in Positive Reviews')

Have the sides mentioned in reviews changed over time?

In [105]:
def sides_df(df):
    """Return the rows of df that mention sides, with a new 'side_items'
    column holding the unique canonical side names per review.

    params:
    df: Review dataframe with 'menu_items'/'menu_cats' list columns and a
        'review_id' key.

    Uses the module-level inv_subs lookup and flatten helper.
    """
    sides_sliced = df[df.menu_cats.map(set(['side']).issubset)]
    sides_dicts = []
    for items, r_id in zip(sides_sliced.menu_items, sides_sliced.review_id):
        # keep only items the synonym lookup knows about
        known_items = [item for item in items if item in inv_subs]
        canonical = flatten([inv_subs.get(item, item) for item in known_items])
        unique_items = list(set(canonical))
        sides_dicts.append({'side_items': unique_items, 'review_id': r_id})

    # local renamed from 'sides_df' so it no longer shadows this function
    grouped = pd.DataFrame.from_dict(sides_dicts)
    sides_only = grouped.merge(sides_sliced, on='review_id', how='left')
    return sides_only

neg_sides_df = sides_df(full_data_neg)

# bug fix: the function defined above is plot_category_trends; the original
# called an undefined/stale name plot_cat_trends
plot_category_trends(neg_sides_df, 'side_items', list(subs.keys()),
               title = "Trends for Sides in Negative Reviews", normalize=True,
               year_start='2014', year_end='2018', time='Y')    
In [106]:
pos_sides_df = sides_df(full_data_pos)
# bug fix: the defined function is plot_category_trends, not plot_cat_trends
plot_category_trends(pos_sides_df, 'side_items', list(subs.keys()),
               title = "Trends for Sides in Positive Reviews", normalize=True,
               year_start='2009', year_end='2018', time='Y')

Has the relative percentage of negative reviews changed over time?

In [107]:
def plot_percent_negative(neg_df, pos_df, col, cat_cols=None, 
                          year_start='2011', year_end='2019', 
                          time = 'Y', normalize = 'category', title=None):
    """Plot the percentage of negative reviews per category over time.

    Parameters
    ----------
    neg_df, pos_df : pd.DataFrame
        Negative / positive review data, each with a 'date' column and a
        list-valued column `col` of category labels.
    col : str
        Name of the list-valued category column to one-hot encode.
    cat_cols : list of str
        Category labels to include in the plot (required in practice).
    year_start, year_end : str
        Year bounds; rows dated between `year_start`-12-31 and
        `year_end`-12-31 (inclusive) are kept.
    time : str
        Pandas resample frequency ('Y', 'M', ...).
    normalize : {'category', 'year'}
        'category' -> negatives as a % of all reviews mentioning that
        category; 'year' -> negatives as a % of all category mentions
        in the period.
    title : str, optional
        Title passed through to `plotly_plot`.
    """
    start = year_start + '-12-31'
    end = year_end + '-12-31'

    def category_counts(df):
        # One-hot encode the list column and count mentions per period.
        indexed = df.set_index(pd.to_datetime(df.date).dt.normalize(), drop=True)
        mask = (indexed['date'] >= start) & (indexed['date'] <= end)
        dummies = indexed.loc[mask, col].str.join('|').str.get_dummies()
        return dummies.resample(time).sum()

    # The original duplicated this preprocessing for each sentiment.
    neg_year_gr = category_counts(neg_df)
    pos_year_gr = category_counts(pos_df)

    neg_perc = pd.DataFrame()
    if normalize == 'category':
        # Within each category: negatives / (negatives + positives)
        for cat in cat_cols:
            total = neg_year_gr[cat] + pos_year_gr[cat]
            neg_perc[cat] = round((neg_year_gr[cat] / total) * 100, 2)
    elif normalize == 'year':
        # Per period bucket: negatives / all category mentions that period
        total = pos_year_gr.sum(axis=1) + neg_year_gr.sum(axis=1)
        for cat in cat_cols:
            neg_perc[cat] = round((neg_year_gr[cat] / total) * 100, 2)
    else:
        # The original fell through to a NameError here; fail clearly.
        raise ValueError("normalize must be 'category' or 'year'")

    plotly_plot(neg_perc, title)

    
# Normalize by category - within each side-dish subcategory (i.e., when
# compared only to itself), what share of the reviews mentioning it are
# negative? (The original comment said "by year", but normalize='category'
# is what is actually passed.)
plot_percent_negative(neg_sides_df, pos_sides_df, 'side_items', 
                      ['butternut', 'chips', 'cream spinach', 'hot veg',
                       'mushrooms', 'onion rings'], year_start='2013',
                      year_end='2019', time='Y', 
                      normalize='category',
                      title='% Negative - All Reviews Discussing Sides')
In [111]:
# Normalize by year - adjusting for the overall mention frequency per year,
# how many negative reviews mentioned side-dish subcategories?
# (The original comment said "by category", but normalize='year' is what
# is actually passed.)
plot_percent_negative(neg_sides_df, pos_sides_df, 'side_items', 
                      ['butternut', 'chips', 'cream spinach', 'hot veg',
                       'mushrooms', 'onion rings'], year_start='2013',
                      year_end='2019', time='Y', 
                      normalize='year',
                      title='% Negative - All Reviews Discussing Sides')
In [110]:
# Normalized by category - within each food subcategory (i.e., comparing
# only to itself), how have negative reviews mentioning menu categories
# changed over the chosen (monthly) timeframe?
plot_percent_negative(full_data_neg, full_data_pos, 'menu_cats', 
                      cat_cols=['side', 'steaks', 'burger', 'breakfast', 
                                'ribs', 'chicken', 'dessert', 'sauces', 'specials'], 
                      year_start='2017', year_end='2019', time='M', 
                      normalize='category', 
                      title='% Negative - Total Reviews with Menu Categories')

Geospatial Sentiment Visualization

Return to Index

Load & Visualize Shape File

The South African geoJSON was found here.

In [5]:
# Shapefile containing the 51 South African district/metro municipality
# polygons (leading path redacted with '...').
spatial_file = '...\\geospatial\\district\\layer1-2.shp'
map_df = gpd.read_file(spatial_file)

map_df.head()
Out[5]:
CATEGORY CAT2 DISTRICT MUNICNAME MAP_TITLE PROVINCE PROVNAME MapNo MUN_CD Shape_Leng Shape_Area FID geometry
0 A District Municipality BUF Buffalo City Buffalo City Metropolitan Municipality EC Eastern Cape 1 (BUF) 4.458223 0.244546 0 POLYGON ((28.07555538279884 -32.90775949489551...
1 A District Municipality CPT City of Cape Town City of Cape Town Metropolitan Municipality WC Western Cape 249 (CPT) 5.198175 0.238297 1 (POLYGON ((18.36893483282362 -33.7899037709124...
2 C District Municipality DC1 West Coast West Coast District Municipality WC Western Cape 250 (DC1) 14.651131 2.973226 2 (POLYGON ((18.07431336270026 -33.4126182858517...
3 C District Municipality DC10 Cacadu Cacadu District Municipality EC Eastern Cape 3 (DC10) 17.874817 5.623927 3 POLYGON ((24.50631653610542 -31.70688574536319...
4 C District Municipality DC12 Amathole Amathole District Municipality EC Eastern Cape 13 (DC12) 16.058800 2.073498 4 POLYGON ((28.35197510764756 -31.82861703618579...
In [13]:
map_df.plot(figsize=(10, 8));

In [14]:
# Only use stores in South African
sa_restaurants = full_data.loc[(full_data.country == 'ZA') | \
                               (full_data.country == 'South Africa')].copy()

# Create the geometry POINTS column from the coordinates
sa_restaurants["geometry"] = sa_restaurants.apply(lambda row: Point(row["longitude"], \
                                                                    row["latitude"]), axis=1)

# Concatenate extracted menu and item data from above analysis
# and add to restaurant/ location data
full_data_pos_cut = full_data_pos[['review_id', 'menu_items', 'menu_cats',\
                                   'num_items', 'num_cats']].copy()
full_data_neg_cut = full_data_neg[['review_id', 'menu_items', 'menu_cats',\
                                   'num_items', 'num_cats']].copy()
full_data_cut = pd.concat([full_data_pos_cut, full_data_neg_cut])
places = sa_restaurants.merge(full_data_cut, on='review_id', how = 'left')

# Remove mixed sentiment reviews
places = places.loc[places.bert_label != 'mixed']

# Longitude and latitude columns no longer needed
del(places["latitude"], places["longitude"])

# Convert to geopandas dataframe and assign the geometry column
places_geo = geopandas.GeoDataFrame(places, geometry="geometry")

# Declare the coordinate system for the places GeoDataFrame
# GeoPandas doesn't do any transformations automatically when performing
# the spatial join. The layers are already in the same CRS (WGS84) so no
# transformation is needed.
places_geo.crs = {"init": "epsg:4326"}

# perform spatial join to link points to district polygons
geo_df = geopandas.tools.sjoin(map_df, places_geo, how="left")

geo_df.head()
Out[14]:
CATEGORY CAT2 DISTRICT MUNICNAME MAP_TITLE PROVINCE PROVNAME MapNo MUN_CD Shape_Leng ... bert_pos bert_neg date country city brand_id menu_items menu_cats num_items num_cats
0 A District Municipality BUF Buffalo City Buffalo City Metropolitan Municipality EC Eastern Cape 1 (BUF) 4.458223 ... 0.000370 0.995991 2017-08-18 20:25:05+00:00 ZA East London 1.0 [salad, meat, salad] [vegetarian, meat, vegetarian] 3.0 2.0
0 A District Municipality BUF Buffalo City Buffalo City Metropolitan Municipality EC Eastern Cape 1 (BUF) 4.458223 ... 0.998938 0.000211 2018-12-03 08:44:19+00:00 ZA East London 1.0 [] [] 0.0 0.0
0 A District Municipality BUF Buffalo City Buffalo City Metropolitan Municipality EC Eastern Cape 1 (BUF) 4.458223 ... 0.001506 0.994791 2014-06-14 07:54:15+00:00 ZA East London 1.0 [lettuce, chicken, lettuce] [ingredients, chicken, ingredients] 3.0 2.0
0 A District Municipality BUF Buffalo City Buffalo City Metropolitan Municipality EC Eastern Cape 1 (BUF) 4.458223 ... 0.999380 0.000127 2019-02-22 16:32:24+00:00 ZA East London 1.0 [] [] 0.0 0.0
0 A District Municipality BUF Buffalo City Buffalo City Metropolitan Municipality EC Eastern Cape 1 (BUF) 4.458223 ... 0.994691 0.002225 2018-10-27 09:18:37+00:00 ZA East London 1.0 [] [] 0.0 0.0

5 rows × 31 columns

Aggregate Columns before Grouping by District

In [15]:
# Look at a specific menu subcategory
def add_cat_freq(df, category, name):
    """Add (in-place) a 0/1 indicator column `name` marking reviews
    whose 'menu_cats' list contains `category`."""
    df[name] = [1 if category in set(review_cats) else 0
                for review_cats in df['menu_cats']]

# Choose the subcategory whose mention frequency we want to map
add_cat_freq(full_data_pos, 'steaks', 'steak_pos')
add_cat_freq(full_data_neg, 'steaks', 'steak_neg')

# Select the relevant columns (plus the new indicator) for merging
full_data_pos_cut = full_data_pos[['review_id', 'menu_items', 'menu_cats', 'num_items', 
                                   'num_cats', 'steak_pos']].copy()
full_data_neg_cut = full_data_neg[['review_id', 'menu_items', 'menu_cats', 'num_items', 
                                   'num_cats', 'steak_neg']].copy()

# Merge with the geospatial data (review + district data), then keep
# only the rows of the matching sentiment
geo_neg = geo_df.merge(full_data_neg_cut, on='review_id', how = 'left')
geo_neg = geo_neg.loc[geo_neg.bert_label == 'neg']

geo_pos = geo_df.merge(full_data_pos_cut, on='review_id', how = 'left')
geo_pos = geo_pos.loc[geo_pos.bert_label == 'pos']

# Positions of the districts with no review data — hard-coded from this
# dataset; update if the underlying data changes
missing_idx = [4, 6, 8, 43]

def aggregate_columns(df, idx, col, expected_total=52):
    """Count reviews and sum an indicator column per district (FID).

    Parameters
    ----------
    df : pd.DataFrame
        Review rows joined to districts, with 'FID', 'review_id' and `col`.
    idx : list of int
        Positions at which districts with no data are inserted as 0, so
        the output aligns with the full ordered list of districts.
    col : str
        Name of the 0/1 indicator column to sum per district.
    expected_total : int, optional
        Expected length of the output lists after insertion; defaults to
        the 52 districts in this notebook's shapefile.

    Returns
    -------
    (list, list)
        Per-district review counts and indicator sums.
    """
    counts = df.groupby('FID')['review_id'].count().tolist()
    cat_col = df.groupby('FID')[col].sum().tolist()
    # Bug fix: iterate over the `idx` argument — the original iterated the
    # global `missing_idx`, silently ignoring its own parameter.
    for i in idx:
        counts.insert(i, 0)
        cat_col.insert(i, 0)

    # Sanity check: output must cover every district (52 by default).
    assert len(counts) == expected_total
    assert len(cat_col) == expected_total

    return counts, cat_col

# Per-district review counts and steak-mention counts for each sentiment
neg_counts, steak_neg = aggregate_columns(geo_neg, missing_idx, 'steak_neg')
pos_counts, steak_pos = aggregate_columns(geo_pos, missing_idx, 'steak_pos')

Group by District & Calculate Ratios/ Percentages

In [16]:
# Columns excluded from the per-district numeric aggregation below
# (identifiers, free text, geometry and per-review metadata; numeric
# columns such as 'rating' are kept so they can be summed per district)
to_drop = ['CATEGORY', 'CAT2', 'DISTRICT', 'MUNICNAME', 'MAP_TITLE', 'PROVINCE',
           'PROVNAME', 'MapNo', 'MUN_CD', 'Shape_Leng', 'Shape_Area',
           'geometry', 'index_right', 'review_id', 'store_id', 'platform_name',
           'clean_text', 'combined', 'bert_label', 'bert_pos',
           'bert_neg', 'date', 'country', 'city', 'brand_id']

# Add stats columns for the chosen subcategory after grouping by district
def add_stats(df, neg_cat, pos_cat):
    """Append (in-place) proportion/percentage columns for one subcategory.

    prop_neg / prop_pos : subcategory mentions as a % of all neg / pos reviews
    percent             : subcategory mentions as a % of all reviews
    perc_neg / perc_pos : negative / positive share within the subcategory
    """
    mentions = df[neg_cat] + df[pos_cat]
    df['prop_neg'] = df[neg_cat] / df['neg_count'] * 100
    df['prop_pos'] = df[pos_cat] / df['pos_count'] * 100
    df['percent'] = mentions / df['total_reviews'] * 100
    df['perc_neg'] = df[neg_cat] / mentions * 100
    df['perc_pos'] = df[pos_cat] / mentions * 100

# Work on a copy so geo_df is preserved for later cells
geo_df2 = geo_df.copy()

geo_df2.drop(to_drop, inplace=True, axis=1)

# Sum the remaining numeric columns per district
districts_geo_df = geo_df2.groupby('FID').sum().reset_index()

# Calculate ratios and percentages on the grouped data to get per-district values
districts_geo_df['neg_count'] = neg_counts
districts_geo_df['pos_count'] = pos_counts
districts_geo_df['total_reviews'] = geo_df.groupby('FID')['review_id'].count().tolist()
districts_geo_df['pos_neg_ratio'] = (districts_geo_df['pos_count']/districts_geo_df['neg_count'])
districts_geo_df['neg_pos_ratio'] = (districts_geo_df['neg_count']/districts_geo_df['pos_count'])
# 'rating' was summed by the groupby above; dividing by the review count
# turns it back into a per-district mean rating
districts_geo_df['rating'] = districts_geo_df['rating']/districts_geo_df['total_reviews']

# Add stats for the 'steaks' subcategory
districts_geo_df['steak_pos'] = steak_pos
districts_geo_df['steak_neg'] = steak_neg
add_stats(districts_geo_df, 'steak_neg', 'steak_pos')
# Districts with zero reviews produce 0/0 = NaN above; zero them out
districts_geo_df.fillna(0, inplace=True)

districts_geo_df
Out[16]:
FID rating num_items num_cats neg_count pos_count total_reviews pos_neg_ratio neg_pos_ratio steak_pos steak_neg prop_neg prop_pos percent perc_neg perc_pos
0 0 4.384615 743.0 595.0 389 2081 2470 5.349614 0.186929 57.0 24.0 6.169666 2.739068 3.279352 29.629630 70.370370
1 1 4.451702 11321.0 8824.0 5140 34862 40002 6.782490 0.147438 687.0 360.0 7.003891 1.970627 2.617369 34.383954 65.616046
2 2 4.437786 443.0 346.0 238 1948 2186 8.184874 0.122177 19.0 20.0 8.403361 0.975359 1.784081 51.282051 48.717949
3 3 4.245255 640.0 540.0 345 1657 2002 4.802899 0.208208 39.0 22.0 6.376812 2.353651 3.046953 36.065574 63.934426
4 4 0.000000 0.0 0.0 0 0 0 0.000000 0.000000 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000
5 5 4.334365 102.0 70.0 33 290 323 8.787879 0.113793 11.0 2.0 6.060606 3.793103 4.024768 15.384615 84.615385
6 6 0.000000 0.0 0.0 0 0 0 0.000000 0.000000 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000
7 7 4.715546 180.0 156.0 176 2204 2380 12.522727 0.079855 15.0 8.0 4.545455 0.680581 0.966387 34.782609 65.217391
8 8 0.000000 0.0 0.0 0 0 0 0.000000 0.000000 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000
9 9 4.404840 208.0 153.0 124 785 909 6.330645 0.157962 24.0 8.0 6.451613 3.057325 3.520352 25.000000 75.000000
10 10 4.552022 448.0 363.0 251 2296 2547 9.147410 0.109321 26.0 8.0 3.187251 1.132404 1.334904 23.529412 76.470588
11 11 4.297580 894.0 693.0 476 2582 3058 5.424370 0.184353 66.0 23.0 4.831933 2.556158 2.910399 25.842697 74.157303
12 12 4.298529 330.0 296.0 202 1158 1360 5.732673 0.174439 35.0 4.0 1.980198 3.022453 2.867647 10.256410 89.743590
13 13 4.452924 382.0 306.0 184 1218 1402 6.619565 0.151067 26.0 14.0 7.608696 2.134647 2.853067 35.000000 65.000000
14 14 4.535979 426.0 373.0 215 1675 1890 7.790698 0.128358 39.0 9.0 4.186047 2.328358 2.539683 18.750000 81.250000
15 15 3.750000 89.0 75.0 82 202 284 2.463415 0.405941 7.0 4.0 4.878049 3.465347 3.873239 36.363636 63.636364
16 16 4.079585 80.0 68.0 64 225 289 3.515625 0.284444 7.0 2.0 3.125000 3.111111 3.114187 22.222222 77.777778
17 17 3.880165 227.0 171.0 119 365 484 3.067227 0.326027 11.0 4.0 3.361345 3.013699 3.099174 26.666667 73.333333
18 18 4.337931 35.0 29.0 39 251 290 6.435897 0.155378 5.0 0.0 0.000000 1.992032 1.724138 0.000000 100.000000
19 19 4.672897 65.0 58.0 82 988 1070 12.048780 0.082996 6.0 2.0 2.439024 0.607287 0.747664 25.000000 75.000000
20 20 4.747004 438.0 388.0 245 3593 3838 14.665306 0.068188 37.0 7.0 2.857143 1.029780 1.146430 15.909091 84.090909
21 21 4.676076 397.0 330.0 169 1992 2161 11.786982 0.084839 16.0 10.0 5.917160 0.803213 1.203147 38.461538 61.538462
22 22 4.049683 389.0 286.0 223 723 946 3.242152 0.308437 15.0 13.0 5.829596 2.074689 2.959831 46.428571 53.571429
23 23 4.435609 540.0 448.0 317 2261 2578 7.132492 0.140203 40.0 12.0 3.785489 1.769129 2.017067 23.076923 76.923077
24 24 4.492339 434.0 372.0 381 2556 2937 6.708661 0.149061 26.0 15.0 3.937008 1.017214 1.395982 36.585366 63.414634
25 25 4.333333 848.0 708.0 492 2706 3198 5.500000 0.181818 76.0 20.0 4.065041 2.808574 3.001876 20.833333 79.166667
26 26 3.991400 328.0 260.0 184 630 814 3.423913 0.292063 20.0 9.0 4.891304 3.174603 3.562654 31.034483 68.965517
27 27 4.651358 186.0 138.0 155 1391 1546 8.974194 0.111431 11.0 4.0 2.580645 0.790798 0.970246 26.666667 73.333333
28 28 4.695613 338.0 308.0 219 3109 3328 14.196347 0.070441 23.0 5.0 2.283105 0.739788 0.841346 17.857143 82.142857
29 29 4.502531 510.0 428.0 316 2450 2766 7.753165 0.128980 47.0 7.0 2.215190 1.918367 1.952278 12.962963 87.037037
30 30 4.459604 637.0 533.0 350 2633 2983 7.522857 0.132928 64.0 12.0 3.428571 2.430687 2.547771 15.789474 84.210526
31 31 4.496672 163.0 142.0 137 1065 1202 7.773723 0.128638 10.0 5.0 3.649635 0.938967 1.247920 33.333333 66.666667
32 32 4.416856 93.0 78.0 57 382 439 6.701754 0.149215 12.0 4.0 7.017544 3.141361 3.644647 25.000000 75.000000
33 33 4.230154 1366.0 1106.0 559 2943 3502 5.264758 0.189942 115.0 42.0 7.513417 3.907577 4.483152 26.751592 73.248408
34 34 4.729316 650.0 549.0 366 6620 6986 18.087432 0.055287 54.0 23.0 6.284153 0.815710 1.102204 29.870130 70.129870
35 35 4.479184 971.0 827.0 766 5359 6125 6.996084 0.142937 58.0 38.0 4.960836 1.082291 1.567347 39.583333 60.416667
36 36 4.691909 70.0 63.0 59 905 964 15.338983 0.065193 5.0 1.0 1.694915 0.552486 0.622407 16.666667 83.333333
37 37 4.328244 200.0 162.0 118 406 524 3.440678 0.290640 12.0 8.0 6.779661 2.955665 3.816794 40.000000 60.000000
38 38 4.495910 159.0 138.0 112 866 978 7.732143 0.129330 27.0 2.0 1.785714 3.117783 2.965235 6.896552 93.103448
39 39 4.424242 95.0 80.0 62 400 462 6.451613 0.155000 7.0 0.0 0.000000 1.750000 1.515152 0.000000 100.000000
40 40 4.578172 1089.0 956.0 762 6165 6927 8.090551 0.123601 59.0 24.0 3.149606 0.957015 1.198210 28.915663 71.084337
41 41 3.967593 206.0 173.0 105 327 432 3.114286 0.321101 18.0 5.0 4.761905 5.504587 5.324074 21.739130 78.260870
42 42 4.387097 51.0 39.0 31 186 217 6.000000 0.166667 5.0 3.0 9.677419 2.688172 3.686636 37.500000 62.500000
43 43 0.000000 0.0 0.0 0 0 0 0.000000 0.000000 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000
44 44 4.317308 153.0 138.0 111 617 728 5.558559 0.179903 13.0 6.0 5.405405 2.106969 2.609890 31.578947 68.421053
45 45 4.187569 228.0 194.0 150 751 901 5.006667 0.199734 28.0 9.0 6.000000 3.728362 4.106548 24.324324 75.675676
46 46 4.629339 4285.0 3578.0 2058 22142 24200 10.758989 0.092946 349.0 104.0 5.053450 1.576190 1.871901 22.958057 77.041943
47 47 4.620823 4837.0 3976.0 2521 23815 26336 9.446648 0.105858 331.0 152.0 6.029353 1.389880 1.833991 31.469979 68.530021
48 48 4.677870 6882.0 5849.0 4613 52274 56887 11.331888 0.088247 514.0 194.0 4.205506 0.983280 1.244573 27.401130 72.598870
49 49 4.773077 1560.0 1347.0 655 14024 14679 21.410687 0.046706 92.0 31.0 4.732824 0.656018 0.837932 25.203252 74.796748
50 50 4.347404 1253.0 1021.0 810 3928 4738 4.849383 0.206212 87.0 63.0 7.777778 2.214868 3.165893 42.000000 58.000000
51 51 4.612723 4747.0 3983.0 2795 29084 31879 10.405725 0.096101 365.0 114.0 4.078712 1.254986 1.502557 23.799582 76.200418

Load Geojson & Merge with Shape File

In [17]:
# Load the district geoJSON for Plotly (leading path redacted with '...')
with open('...\\geospatial\\layer1-2.json') as response:
    districts = json.load(response)

# Merge the shapefile districts with the processed per-district stats,
# joined on FID so every district polygon keeps a row
merged = map_df.set_index('FID').join(districts_geo_df.set_index('FID'))
merged.reset_index(inplace=True)
# Districts missing from the stats end up NaN after the join; zero them
merged.fillna(0, axis=0, inplace=True)

merged.head()
Out[17]:
FID CATEGORY CAT2 DISTRICT MUNICNAME MAP_TITLE PROVINCE PROVNAME MapNo MUN_CD ... total_reviews pos_neg_ratio neg_pos_ratio steak_pos steak_neg prop_neg prop_pos percent perc_neg perc_pos
0 0 A District Municipality BUF Buffalo City Buffalo City Metropolitan Municipality EC Eastern Cape 1 (BUF) ... 2470 5.349614 0.186929 57.0 24.0 6.169666 2.739068 3.279352 29.629630 70.370370
1 1 A District Municipality CPT City of Cape Town City of Cape Town Metropolitan Municipality WC Western Cape 249 (CPT) ... 40002 6.782490 0.147438 687.0 360.0 7.003891 1.970627 2.617369 34.383954 65.616046
2 2 C District Municipality DC1 West Coast West Coast District Municipality WC Western Cape 250 (DC1) ... 2186 8.184874 0.122177 19.0 20.0 8.403361 0.975359 1.784081 51.282051 48.717949
3 3 C District Municipality DC10 Cacadu Cacadu District Municipality EC Eastern Cape 3 (DC10) ... 2002 4.802899 0.208208 39.0 22.0 6.376812 2.353651 3.046953 36.065574 63.934426
4 4 C District Municipality DC12 Amathole Amathole District Municipality EC Eastern Cape 13 (DC12) ... 0 0.000000 0.000000 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000

5 rows × 28 columns

Plotly Choropleth - Pos:Neg Review Ratio

Return to Index

In [18]:
color = "Viridis"

fig = go.Figure(go.Choroplethmapbox(geojson=districts, #provinces,   # 
                                    locations=merged.FID, 
                                    z=merged.pos_neg_ratio,
                                    text=merged.MUNICNAME,
                                    name='',
                                    below='traces',
                                    colorscale=color, zmin=0, zmax=15,
                                    colorbar = dict(thickness=20, ticklen=3),
                                    hovertemplate = '<b>District</b>: <b>%{text}</b>'+
                                            '<br><b>Val </b>: %{z}<br>',
                                    hoverinfo = 'text',
                                    marker_opacity=0.7, marker_line_width=0.1))

fig.update_layout(mapbox = {
                'style':"carto-positron",
                'zoom': 5,
                'center': {"lat": -30.5595, "lon": 22.9375},
                'layers': [
                      {'source': {
                            'type': "FeatureCollection",
                            'sourcetype': 'geojson',
                            'features': districts['features']},
                             'type': 'fill', 'below': 'traces', 'color': 'lightgreen'}]})

fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

Geopandas Choropleth - Sub-menu Sentiment: Steak

Return to Index

In [23]:
# Set a variable that selects whichever column we want to visualise on the map
variable = 'perc_neg'
# Set the range for the choropleth colorbar
vmin, vmax = 0, 50
# Create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(14, 10))

# Create the map, coloring each district polygon by `variable`
color = "Blues"
merged.plot(variable, cmap=color, linewidth=0.2, ax=ax, edgecolor='0.1', alpha=0.8)

# Hide axis ticks/frame — only the map should show
ax.axis('off')

# Add a title
ax.set_title('% of Reviews Mentioning Steaks that are Negative', 
             fontdict={'fontsize': '25', 'fontweight' : '3'})

# Create a colorbar to act as a legend
sm = plt.cm.ScalarMappable(cmap=color, norm=plt.Normalize(vmin=vmin, vmax=vmax))
# Empty array for the data range — workaround so the ScalarMappable can be
# passed to colorbar. NOTE(review): newer matplotlib accepts the mappable
# directly and prefers fig.colorbar(sm, ax=ax); verify against the pinned
# matplotlib version before changing.
sm._A = []
# Add the colorbar to the figure
cbar = fig.colorbar(sm)

Prepare Data for Tableau

In [14]:
# Expand categories into multiple rows: for example, if one review 
# has the categories: sides and steak, there will be two rows with 
# the same review - one for sides and one for steak. 

# Create a plain pandas dataframe containing the geometry information
# to export into Tableau
geo_pd = pd.DataFrame(geo_df)
geo_pd.reset_index(inplace=True, drop=True)

def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Transform each element of a list-like column into its own row,
    replicating the index values and the other fields.

    Parameters
    ----------
    df : pd.DataFrame
    lst_cols : str or sequence of str
        Column(s) holding list-like values; when several are given, all
        lists within a row are assumed to be the same length.
    fill_value : scalar
        Value used for the list column(s) of rows whose lists are empty.
    preserve_index : bool
        Keep the original (repeated) index instead of resetting it.

    Returns
    -------
    pd.DataFrame
        Dataframe with the exploded column(s).
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # per-row list lengths drive how many times each row is repeated
    lens = df[lst_cols[0]].str.len()
    # preserve original index values
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col: np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col: np.concatenate(df.loc[lens > 0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists (they produce no exploded rows)
    if (lens == 0).any():
        # Bug fix: DataFrame.append was removed in pandas 2.0;
        # pd.concat is the supported equivalent.
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                 .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

def expand_cats(df, col):
    """Preprocess the dataframe and explode the chosen category column.

    Null values are replaced with empty strings (which de-duplicate to an
    empty list), and each row's categories are de-duplicated so a category
    is only exploded once per sample.

    NOTE: de-duplicating via set() leaves the within-row order of the
    exploded values unspecified.
    """
    # Reassign rather than calling fillna(inplace=True) on the column:
    # in-place Series.fillna through df[col] is a chained operation that
    # is unreliable under pandas copy-on-write.
    df[col] = df[col].fillna('')
    df[col] = df[col].apply(set)
    df[col] = df[col].apply(list)
    return explode(df, col)

# Explode 'menu_cats' so each review/category pair gets its own row
geo_pd = expand_cats(geo_pd, 'menu_cats')
# Uncomment to export the result for Tableau
#geo_pd.to_csv('geospatial_reviews.csv')